I’m expect to learn how to do Rstudio with huge data. In addition, I want to know about how to visualize the data properly.
Link: https://github.com/swinesci/IODS-project
I have a problem with chapter 2
Write a short description about the course and add a link to your GitHub repository here. This is an R Markdown (.Rmd) file so you can use R Markdown syntax.
df <- read.table("http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/learning2014.txt", sep=",", header=TRUE)
str(df)
## 'data.frame': 166 obs. of 7 variables:
## $ gender : Factor w/ 2 levels "F","M": 1 2 1 2 2 1 2 1 2 1 ...
## $ age : int 53 55 49 53 49 38 50 37 37 42 ...
## $ attitude: num 3.7 3.1 2.5 3.5 3.7 3.8 3.5 2.9 3.8 2.1 ...
## $ deep : num 3.58 2.92 3.5 3.5 3.67 ...
## $ stra : num 3.38 2.75 3.62 3.12 3.62 ...
## $ surf : num 2.58 3.17 2.25 2.25 2.83 ...
## $ points : int 25 12 24 10 22 21 21 31 24 26 ...
dim(df)
## [1] 166 7
library(ggplot2)
ggplot(df)
pairs(df[-1])
## to access the GGally and ggplot2
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
summary(df)
## gender age attitude deep stra
## F:110 Min. :17.00 Min. :1.400 Min. :1.583 Min. :1.250
## M: 56 1st Qu.:21.00 1st Qu.:2.600 1st Qu.:3.333 1st Qu.:2.625
## Median :22.00 Median :3.200 Median :3.667 Median :3.188
## Mean :25.51 Mean :3.143 Mean :3.680 Mean :3.121
## 3rd Qu.:27.00 3rd Qu.:3.700 3rd Qu.:4.083 3rd Qu.:3.625
## Max. :55.00 Max. :5.000 Max. :4.917 Max. :5.000
## surf points
## Min. :1.583 Min. : 7.00
## 1st Qu.:2.417 1st Qu.:19.00
## Median :2.833 Median :23.00
## Mean :2.787 Mean :22.72
## 3rd Qu.:3.167 3rd Qu.:27.75
## Max. :4.333 Max. :33.00
p <- ggpairs(df, maaping = aes(col = gender, alpha = 0.3), lower = list(combo = wrap("facethist", bins = 20)))
## Warning in warn_if_args_exist(list(...)): Extra arguments: "maaping" are
## being ignored. If these are meant to be aesthetics, submit them using the
## 'mapping' variable within ggpairs with ggplot2::aes or ggplot2::aes_string.
p
##The three explanatory variables are “attitude”, “strategy” and “surface”. These are choosen based on r value
qplot(attitude, points, data = df) + geom_smooth(method = "lm")
## fit a linear model
model <- lm(points ~1, data = df)
summary(model)
##
## Call:
## lm(formula = points ~ 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.7169 -3.7169 0.2831 5.0331 10.2831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.7169 0.4575 49.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.895 on 165 degrees of freedom
library(GGally)
ggpairs(df, lower = list(combo = wrap("facethist", bins = 20)))
model2<- lm(points ~ attitude + stra + surf, data = df)
summary(model2)
##
## Call:
## lm(formula = points ~ attitude + stra + surf, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.1550 -3.4346 0.5156 3.6401 10.8952
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.0171 3.6837 2.991 0.00322 **
## attitude 3.3952 0.5741 5.913 1.93e-08 ***
## stra 0.8531 0.5416 1.575 0.11716
## surf -0.5861 0.8014 -0.731 0.46563
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.296 on 162 degrees of freedom
## Multiple R-squared: 0.2074, Adjusted R-squared: 0.1927
## F-statistic: 14.13 on 3 and 162 DF, p-value: 3.156e-08
remove the surf because its p value was the highest
model3<- lm(points ~attitude + stra, data=df)
summary(model3)
##
## Call:
## lm(formula = points ~ attitude + stra, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.6436 -3.3113 0.5575 3.7928 10.9295
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.9729 2.3959 3.745 0.00025 ***
## attitude 3.4658 0.5652 6.132 6.31e-09 ***
## stra 0.9137 0.5345 1.709 0.08927 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.289 on 163 degrees of freedom
## Multiple R-squared: 0.2048, Adjusted R-squared: 0.1951
## F-statistic: 20.99 on 2 and 163 DF, p-value: 7.734e-09
the most significant variable was attitude. Stra had only tendency
par(mfrow = c(2,2))
plot(model3, which = c(1,2,5))
QQ plot showed this model is normally distributed all looks good
alc <- read.csv("Z:\\IODS-project\\data\\alc.csv", sep = ",")
library(tidyr); library(dplyr); library(ggplot2)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
glimpse(alc)
## Observations: 382
## Variables: 36
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ school <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP,...
## $ sex <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F, ...
## $ age <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15,...
## $ address <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, ...
## $ famsize <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, G...
## $ Pstatus <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T, ...
## $ Medu <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, ...
## $ Fedu <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, ...
## $ Mjob <fct> at_home, at_home, at_home, health, other, services,...
## $ Fjob <fct> teacher, other, other, services, other, other, othe...
## $ reason <fct> course, course, other, home, home, reputation, home...
## $ nursery <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, ye...
## $ internet <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes,...
## $ guardian <fct> mother, father, mother, mother, father, mother, mot...
## $ traveltime <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, ...
## $ studytime <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, ...
## $ failures <int> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ schoolsup <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no, ...
## $ famsup <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes,...
## $ paid <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes, ...
## $ activities <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes,...
## $ higher <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, y...
## $ romantic <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no...
## $ famrel <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, ...
## $ freetime <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, ...
## $ goout <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, ...
## $ Dalc <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Walc <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, ...
## $ health <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, ...
## $ absences <int> 5, 3, 8, 1, 2, 8, 0, 4, 0, 0, 1, 2, 1, 1, 0, 5, 8, ...
## $ G1 <int> 2, 7, 10, 14, 8, 14, 12, 8, 16, 13, 12, 10, 13, 11,...
## $ G2 <int> 8, 8, 10, 14, 12, 14, 12, 9, 17, 14, 11, 12, 14, 11...
## $ G3 <int> 8, 8, 11, 14, 12, 14, 12, 10, 18, 14, 12, 12, 13, 1...
## $ alc_use <dbl> 1.0, 1.0, 2.5, 1.0, 1.5, 1.5, 1.0, 1.0, 1.0, 1.0, 1...
## $ high_use <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FAL...
gather(alc) %>% glimpse
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Observations: 13,752
## Variables: 2
## $ key <chr> "X", "X", "X", "X", "X", "X", "X", "X", "X", "X", "X", "...
## $ value <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",...
gather(alc) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped
library(dplyr)
library(ggplot2)
alc %>% group_by(sex) %>% summarise(count = n())
## # A tibble: 2 x 2
## sex count
## <fct> <int>
## 1 F 198
## 2 M 184
library(ggplot2)
g1 <- ggplot(alc, aes(x = high_use, y = G3, col = sex))
g1 + geom_boxplot() + ylab("grade")
g2 <- ggplot(alc, aes(x = high_use, y = absences, col = sex))
g2 + geom_boxplot() + ggtitle("Student absences by alcohol consumption and sex")
m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial")
summary(m)
##
## Call:
## glm(formula = high_use ~ failures + absences + sex, family = "binomial",
## data = alc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1855 -0.8371 -0.6000 1.1020 2.0209
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.90297 0.22626 -8.411 < 2e-16 ***
## failures 0.45082 0.18992 2.374 0.017611 *
## absences 0.09322 0.02295 4.063 4.85e-05 ***
## sexM 0.94117 0.24200 3.889 0.000101 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 465.68 on 381 degrees of freedom
## Residual deviance: 424.40 on 378 degrees of freedom
## AIC: 432.4
##
## Number of Fisher Scoring iterations: 4
coef(m)
## (Intercept) failures absences sexM
## -1.90296550 0.45081981 0.09321999 0.94116602
m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial")
OR <- coef(m) %>% exp
CI <- confint(m) %>% exp
## Waiting for profiling to be done...
cbind(OR, CI)
## OR 2.5 % 97.5 %
## (Intercept) 0.1491257 0.09395441 0.228611
## failures 1.5695984 1.08339644 2.294737
## absences 1.0977032 1.05169654 1.150848
## sexM 2.5629682 1.60381392 4.149405
m <- glm(high_use ~ failures + absences + sex, data = alc, family = "binomial")
probabilities <- predict(m, type = "response")
alc <- mutate(alc, probability = probabilities)
alc <- mutate(alc, prediction = probability > 0.5)
select(alc, failures, absences, sex, high_use, probability, prediction) %>% tail(10)
## failures absences sex high_use probability prediction
## 373 1 0 M FALSE 0.3749639 FALSE
## 374 1 7 M TRUE 0.5353311 TRUE
## 375 0 1 F FALSE 0.1406689 FALSE
## 376 0 6 F FALSE 0.2069112 FALSE
## 377 1 2 F FALSE 0.2199932 FALSE
## 378 0 2 F FALSE 0.1523192 FALSE
## 379 2 2 F FALSE 0.3068503 FALSE
## 380 0 3 F FALSE 0.1647495 FALSE
## 381 0 4 M TRUE 0.3568828 FALSE
## 382 0 2 M TRUE 0.3153209 FALSE
table(high_use = alc$high_use, prediction = alc$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 259 9
## TRUE 84 30
library(dplyr); library(ggplot2)
g <- ggplot(alc, aes(x = probability, y = high_use, col = prediction))
g + geom_point()
table(high_use = alc$high_use, prediction = alc$prediction) %>% prop.table %>% addmargins
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.67801047 0.02356021 0.70157068
## TRUE 0.21989529 0.07853403 0.29842932
## Sum 0.89790576 0.10209424 1.00000000
loss_func <- function(class, prob) {
n_wrong <- abs(class - prob) > 0.5
mean(n_wrong)
}
loss_func(class = alc$high_use, prob = alc$probability)
## [1] 0.2434555
loss_func <- function(class, prob) {
n_wrong <- abs(class - prob) > 0.5
mean(n_wrong)
}
loss_func(class = alc$high_use, prob = alc$probability)
## [1] 0.2434555
library(boot)
cv <- cv.glm(data = alc, cost = loss_func, glmfit = m, K = 10)
cv$delta[1]
## [1] 0.2591623
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
data("Boston")
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
library(tidyr)
library(corrplot)
## corrplot 0.84 loaded
library(dplyr)
cor_matrix<-cor(Boston) %>% round(digits = 2 )
cor_matrix
## crim zn indus chas nox rm age dis rad tax
## crim 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58
## zn -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31
## indus 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72
## chas -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04
## nox 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67
## rm -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29
## age 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51
## dis -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53
## rad 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91
## tax 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00
## ptratio 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46
## black -0.39 0.18 -0.36 0.05 -0.38 0.13 -0.27 0.29 -0.44 -0.44
## lstat 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54
## medv -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47
## ptratio black lstat medv
## crim 0.29 -0.39 0.46 -0.39
## zn -0.39 0.18 -0.41 0.36
## indus 0.38 -0.36 0.60 -0.48
## chas -0.12 0.05 -0.05 0.18
## nox 0.19 -0.38 0.59 -0.43
## rm -0.36 0.13 -0.61 0.70
## age 0.26 -0.27 0.60 -0.38
## dis -0.23 0.29 -0.50 0.25
## rad 0.46 -0.44 0.49 -0.38
## tax 0.46 -0.44 0.54 -0.47
## ptratio 1.00 -0.18 0.37 -0.51
## black -0.18 1.00 -0.37 0.33
## lstat 0.37 -0.37 1.00 -0.74
## medv -0.51 0.33 -0.74 1.00
corrplot(cor_matrix, method="circle", type="upper", cl.pos="b", tl.pos="d", tl.cex = 0.6)
It seems that “dis” is closely related to “age”, “nox”, “indus” and “zn”.
boston_scaled <- scale(Boston)
summary(boston_scaled)
## crim zn indus
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668
## Median :-0.390280 Median :-0.48724 Median :-0.2109
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202
## chas nox rm age
## Min. :-0.2723 Min. :-1.4644 Min. :-3.8764 Min. :-2.3331
## 1st Qu.:-0.2723 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366
## Median :-0.2723 Median :-0.1441 Median :-0.1084 Median : 0.3171
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2723 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059
## Max. : 3.6648 Max. : 2.7296 Max. : 3.5515 Max. : 1.1164
## dis rad tax ptratio
## Min. :-1.2658 Min. :-0.9819 Min. :-1.3127 Min. :-2.7047
## 1st Qu.:-0.8049 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876
## Median :-0.2790 Median :-0.5225 Median :-0.4642 Median : 0.2746
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6617 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058
## Max. : 3.9566 Max. : 1.6596 Max. : 1.7964 Max. : 1.6372
## black lstat medv
## Min. :-3.9033 Min. :-1.5296 Min. :-1.9063
## 1st Qu.: 0.2049 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median : 0.3808 Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4332 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 0.4406 Max. : 3.5453 Max. : 2.9865
class(boston_scaled)
## [1] "matrix"
boston_scaled <- as.data.frame(boston_scaled)
summary(boston_scaled$crim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.419367 -0.410563 -0.390280 0.000000 0.007389 9.924110
bins <- quantile(boston_scaled$crim)
bins
## 0% 25% 50% 75% 100%
## -0.419366929 -0.410563278 -0.390280295 0.007389247 9.924109610
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
table(crime)
## crime
## low med_low med_high high
## 127 126 126 127
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
n <- nrow(Boston)
ind <- sample(n, size = n * 0.8)
train <- boston_scaled[ind,]
test <- boston_scaled[-ind,]
correct_classes <- test$crime
test <- dplyr::select(test, -crime)
lda.fit <- lda(crime ~ ., data = train)
lda.fit
## Call:
## lda(crime ~ ., data = train)
##
## Prior probabilities of groups:
## low med_low med_high high
## 0.2698020 0.2450495 0.2400990 0.2450495
##
## Group means:
## zn indus chas nox rm
## low 0.98612050 -0.8924018 -0.09172814 -0.8858568 0.47353561
## med_low -0.05587021 -0.3276933 -0.03371693 -0.5590893 -0.14006514
## med_high -0.38822494 0.1504573 0.17414622 0.3174855 0.06525623
## high -0.48724019 1.0171737 -0.03371693 1.0334992 -0.42577902
## age dis rad tax ptratio
## low -0.8726099 0.8248007 -0.6794767 -0.7563112 -0.48247146
## med_low -0.3636924 0.3663021 -0.5514861 -0.4975961 -0.08117323
## med_high 0.3267732 -0.3297702 -0.4029017 -0.3163061 -0.20374675
## high 0.7950462 -0.8610240 1.6375616 1.5136504 0.78011702
## black lstat medv
## low 0.37759900 -0.76378783 0.59115548
## med_low 0.30307953 -0.15960231 0.01466445
## med_high 0.09171584 -0.02482685 0.17757471
## high -0.77885925 0.86989690 -0.69295838
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## zn 0.12819824 0.86206517 -0.89600261
## indus 0.02935284 -0.13432844 0.14537970
## chas -0.06905987 -0.06647042 0.09313964
## nox 0.36719955 -0.78015032 -1.40445803
## rm -0.09577365 -0.03382529 -0.18392304
## age 0.23858932 -0.26321002 -0.10201653
## dis -0.09662041 -0.42953998 0.19809773
## rad 3.13115233 1.16642529 -0.09574155
## tax 0.03004027 -0.34744326 0.71440042
## ptratio 0.11716773 0.02801709 -0.32043792
## black -0.11979814 0.02612750 0.11399064
## lstat 0.23480319 -0.22492901 0.29751841
## medv 0.18640433 -0.39919765 -0.29792133
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.9529 0.0361 0.0110
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "orange", tex = 0.75, choices = c(1,2)){
heads <- coef(x)
arrows(x0 = 0, y0 = 0,
x1 = myscale * heads[,choices[1]],
y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
text(myscale * heads[,choices], labels = row.names(heads),
cex = tex, col=color, pos=3)
}
classes <- as.numeric(train$crime)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)
lda.pred <- predict(lda.fit, newdata = test)
table(correct = correct_classes, predicted = lda.pred$class)
## predicted
## correct low med_low med_high high
## low 11 7 0 0
## med_low 3 17 7 0
## med_high 0 6 22 1
## high 0 0 0 28
library(MASS)
data('Boston')
boston_scaled <- scale(Boston)
boston_scaled <- as.data.frame(boston_scaled)
dist_eu <- dist(boston_scaled)
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.4625 4.8241 4.9111 6.1863 14.3970
km <-kmeans(boston_scaled, centers = 3)
pairs(boston_scaled, col = km$cluster)
set.seed(123)
k_max <- 10
twcss <- sapply(1:k_max, function(k){kmeans(boston_scaled, k)$tot.withinss})
library(ggplot2)
qplot(x = 1:k_max, y = twcss, geom = 'line')
km <-kmeans(boston_scaled, centers = 2)
pairs(boston_scaled, col = km$cluster)
library(GGally)
ggpairs(boston_scaled)
model_predictors <- dplyr::select(train, -crime)
dim(model_predictors)
## [1] 404 13
dim(lda.fit$scaling)
## [1] 13 3
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = train$crime )
km3D <-kmeans(boston_scaled, centers = 4)
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = km3D$cluster[ind])
human <- read.table("http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/human2.txt", sep =",", header = T)
str(human)
## 'data.frame': 155 obs. of 8 variables:
## $ Edu2.FM : num 1.007 0.997 0.983 0.989 0.969 ...
## $ Labo.FM : num 0.891 0.819 0.825 0.884 0.829 ...
## $ Edu.Exp : num 17.5 20.2 15.8 18.7 17.9 16.5 18.6 16.5 15.9 19.2 ...
## $ Life.Exp : num 81.6 82.4 83 80.2 81.6 80.9 80.9 79.1 82 81.8 ...
## $ GNI : int 64992 42261 56431 44025 45435 43919 39568 52947 42155 32689 ...
## $ Mat.Mor : int 4 6 6 5 6 7 9 28 11 8 ...
## $ Ado.Birth: num 7.8 12.1 1.9 5.1 6.2 3.8 8.2 31 14.5 25.3 ...
## $ Parli.F : num 39.6 30.5 28.5 38 36.9 36.9 19.9 19.4 28.2 31.4 ...
dim(human)
## [1] 155 8
summary(human)
## Edu2.FM Labo.FM Edu.Exp Life.Exp
## Min. :0.1717 Min. :0.1857 Min. : 5.40 Min. :49.00
## 1st Qu.:0.7264 1st Qu.:0.5984 1st Qu.:11.25 1st Qu.:66.30
## Median :0.9375 Median :0.7535 Median :13.50 Median :74.20
## Mean :0.8529 Mean :0.7074 Mean :13.18 Mean :71.65
## 3rd Qu.:0.9968 3rd Qu.:0.8535 3rd Qu.:15.20 3rd Qu.:77.25
## Max. :1.4967 Max. :1.0380 Max. :20.20 Max. :83.50
## GNI Mat.Mor Ado.Birth Parli.F
## Min. : 581 Min. : 1.0 Min. : 0.60 Min. : 0.00
## 1st Qu.: 4198 1st Qu.: 11.5 1st Qu.: 12.65 1st Qu.:12.40
## Median : 12040 Median : 49.0 Median : 33.60 Median :19.30
## Mean : 17628 Mean : 149.1 Mean : 47.16 Mean :20.91
## 3rd Qu.: 24512 3rd Qu.: 190.0 3rd Qu.: 71.95 3rd Qu.:27.95
## Max. :123124 Max. :1100.0 Max. :204.80 Max. :57.50
library(MASS) library(tidyr) library(dplyr)
cor(human)
## Edu2.FM Labo.FM Edu.Exp Life.Exp GNI
## Edu2.FM 1.000000000 0.009564039 0.59325156 0.5760299 0.43030485
## Labo.FM 0.009564039 1.000000000 0.04732183 -0.1400125 -0.02173971
## Edu.Exp 0.593251562 0.047321827 1.00000000 0.7894392 0.62433940
## Life.Exp 0.576029853 -0.140012504 0.78943917 1.0000000 0.62666411
## GNI 0.430304846 -0.021739705 0.62433940 0.6266641 1.00000000
## Mat.Mor -0.660931770 0.240461075 -0.73570257 -0.8571684 -0.49516234
## Ado.Birth -0.529418415 0.120158862 -0.70356489 -0.7291774 -0.55656208
## Parli.F 0.078635285 0.250232608 0.20608156 0.1700863 0.08920818
## Mat.Mor Ado.Birth Parli.F
## Edu2.FM -0.6609318 -0.5294184 0.07863528
## Labo.FM 0.2404611 0.1201589 0.25023261
## Edu.Exp -0.7357026 -0.7035649 0.20608156
## Life.Exp -0.8571684 -0.7291774 0.17008631
## GNI -0.4951623 -0.5565621 0.08920818
## Mat.Mor 1.0000000 0.7586615 -0.08944000
## Ado.Birth 0.7586615 1.0000000 -0.07087810
## Parli.F -0.0894400 -0.0708781 1.00000000
library(GGally)
library(ggplot2)
library(dplyr)
library(tidyr)
ggpairs(human)
library(ggplot2)
library(corrplot)
cor(human) %>% corrplot( method="circle", type="upper", title= "Graph 1. Correlations between variables in human data")
4 hypothesis 1) negative relationship between Edu2.FM and Mat.Mor 2) negative relationship between Edu.Exp and Mat.Mor 3) negative relationship between Life.Exp and Mat.Mor 4) negative relationship between GNI and Mat.Mor
pca_human <- prcomp(human)
pca_human
## Standard deviations (1, .., p=8):
## [1] 1.854416e+04 1.855219e+02 2.518701e+01 1.145441e+01 3.766241e+00
## [6] 1.565912e+00 1.912052e-01 1.591112e-01
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4
## Edu2.FM -5.607472e-06 0.0006713951 -3.412027e-05 -2.736326e-04
## Labo.FM 2.331945e-07 -0.0002819357 5.302884e-04 -4.692578e-03
## Edu.Exp -9.562910e-05 0.0075529759 1.427664e-02 -3.313505e-02
## Life.Exp -2.815823e-04 0.0283150248 1.294971e-02 -6.752684e-02
## GNI -9.999832e-01 -0.0057723054 -5.156742e-04 4.932889e-05
## Mat.Mor 5.655734e-03 -0.9916320120 1.260302e-01 -6.100534e-03
## Ado.Birth 1.233961e-03 -0.1255502723 -9.918113e-01 5.301595e-03
## Parli.F -5.526460e-05 0.0032317269 -7.398331e-03 -9.971232e-01
## PC5 PC6 PC7 PC8
## Edu2.FM -0.0022935252 2.180183e-02 6.998623e-01 7.139410e-01
## Labo.FM 0.0022190154 3.264423e-02 7.132267e-01 -7.001533e-01
## Edu.Exp 0.1431180282 9.882477e-01 -3.826887e-02 7.776451e-03
## Life.Exp 0.9865644425 -1.453515e-01 5.380452e-03 2.281723e-03
## GNI -0.0001135863 -2.711698e-05 -8.075191e-07 -1.176762e-06
## Mat.Mor 0.0266373214 1.695203e-03 1.355518e-04 8.371934e-04
## Ado.Birth 0.0188618600 1.273198e-02 -8.641234e-05 -1.707885e-04
## Parli.F -0.0716401914 -2.309896e-02 -2.642548e-03 2.680113e-03
biplot(pca_human, choices = 1:2, cex = c(0.8, 1), col= c("grey40", "deeppink2"), main = "Grpagh 2. PCA with human data, unscaled")
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
It is very hard to see
human.std <- scale(human)
pca_human2 <- prcomp(human.std)
biplot(pca_human2, choices= 1:2, cex = c(0.8, 1), col= c("grey40", "deeppink2"), main="Graph 3. PCA with human data, scaled")
Not clear so we will change it
s <- summary(pca_human2)
pca_pr <- round(100*s$importance[2, ], digits = 1)
pc_lab <- paste0(names(pca_pr), " (", pca_pr, "%)")
biplot(pca_human2, cex = c(0.8, 1), col = c("grey40", "deeppink2"), xlab = pc_lab[1], ylab = pc_lab[2], main="Graph 4. PCA with human data, scaled, %")
looks good but still hard to see
library(FactoMineR)
es.pca = PCA(human, scale.unit= TRUE, ncp=5)
Much better now
library(ggplot2)
library(dplyr)
library(tidyr)
library(FactoMineR)
data(tea)
my_tea <- data.frame(tea)
dim(my_tea)
## [1] 300 36
str(my_tea)
## 'data.frame': 300 obs. of 36 variables:
## $ breakfast : Factor w/ 2 levels "breakfast","Not.breakfast": 1 1 2 2 1 2 1 2 1 1 ...
## $ tea.time : Factor w/ 2 levels "Not.tea time",..: 1 1 2 1 1 1 2 2 2 1 ...
## $ evening : Factor w/ 2 levels "evening","Not.evening": 2 2 1 2 1 2 2 1 2 1 ...
## $ lunch : Factor w/ 2 levels "lunch","Not.lunch": 2 2 2 2 2 2 2 2 2 2 ...
## $ dinner : Factor w/ 2 levels "dinner","Not.dinner": 2 2 1 1 2 1 2 2 2 2 ...
## $ always : Factor w/ 2 levels "always","Not.always": 2 2 2 2 1 2 2 2 2 2 ...
## $ home : Factor w/ 2 levels "home","Not.home": 1 1 1 1 1 1 1 1 1 1 ...
## $ work : Factor w/ 2 levels "Not.work","work": 1 1 2 1 1 1 1 1 1 1 ...
## $ tearoom : Factor w/ 2 levels "Not.tearoom",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ friends : Factor w/ 2 levels "friends","Not.friends": 2 2 1 2 2 2 1 2 2 2 ...
## $ resto : Factor w/ 2 levels "Not.resto","resto": 1 1 2 1 1 1 1 1 1 1 ...
## $ pub : Factor w/ 2 levels "Not.pub","pub": 1 1 1 1 1 1 1 1 1 1 ...
## $ Tea : Factor w/ 3 levels "black","Earl Grey",..: 1 1 2 2 2 2 2 1 2 1 ...
## $ How : Factor w/ 4 levels "alone","lemon",..: 1 3 1 1 1 1 1 3 3 1 ...
## $ sugar : Factor w/ 2 levels "No.sugar","sugar": 2 1 1 2 1 1 1 1 1 1 ...
## $ how : Factor w/ 3 levels "tea bag","tea bag+unpackaged",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ where : Factor w/ 3 levels "chain store",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ price : Factor w/ 6 levels "p_branded","p_cheap",..: 4 6 6 6 6 3 6 6 5 5 ...
## $ age : int 39 45 47 23 48 21 37 36 40 37 ...
## $ sex : Factor w/ 2 levels "F","M": 2 1 1 2 2 2 2 1 2 2 ...
## $ SPC : Factor w/ 7 levels "employee","middle",..: 2 2 4 6 1 6 5 2 5 5 ...
## $ Sport : Factor w/ 2 levels "Not.sportsman",..: 2 2 2 1 2 2 2 2 2 1 ...
## $ age_Q : Factor w/ 5 levels "15-24","25-34",..: 3 4 4 1 4 1 3 3 3 3 ...
## $ frequency : Factor w/ 4 levels "1/day","1 to 2/week",..: 1 1 3 1 3 1 4 2 3 3 ...
## $ escape.exoticism: Factor w/ 2 levels "escape-exoticism",..: 2 1 2 1 1 2 2 2 2 2 ...
## $ spirituality : Factor w/ 2 levels "Not.spirituality",..: 1 1 1 2 2 1 1 1 1 1 ...
## $ healthy : Factor w/ 2 levels "healthy","Not.healthy": 1 1 1 1 2 1 1 1 2 1 ...
## $ diuretic : Factor w/ 2 levels "diuretic","Not.diuretic": 2 1 1 2 1 2 2 2 2 1 ...
## $ friendliness : Factor w/ 2 levels "friendliness",..: 2 2 1 2 1 2 2 1 2 1 ...
## $ iron.absorption : Factor w/ 2 levels "iron absorption",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ feminine : Factor w/ 2 levels "feminine","Not.feminine": 2 2 2 2 2 2 2 1 2 2 ...
## $ sophisticated : Factor w/ 2 levels "Not.sophisticated",..: 1 1 1 2 1 1 1 2 2 1 ...
## $ slimming : Factor w/ 2 levels "No.slimming",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ exciting : Factor w/ 2 levels "exciting","No.exciting": 2 1 2 2 2 2 2 2 2 2 ...
## $ relaxing : Factor w/ 2 levels "No.relaxing",..: 1 1 2 2 2 2 2 2 2 2 ...
## $ effect.on.health: Factor w/ 2 levels "effect on health",..: 2 2 2 2 2 2 2 2 2 2 ...
summary(my_tea)
## breakfast tea.time evening lunch
## breakfast :144 Not.tea time:131 evening :103 lunch : 44
## Not.breakfast:156 tea time :169 Not.evening:197 Not.lunch:256
##
##
##
##
##
## dinner always home work
## dinner : 21 always :103 home :291 Not.work:213
## Not.dinner:279 Not.always:197 Not.home: 9 work : 87
##
##
##
##
##
## tearoom friends resto pub
## Not.tearoom:242 friends :196 Not.resto:221 Not.pub:237
## tearoom : 58 Not.friends:104 resto : 79 pub : 63
##
##
##
##
##
## Tea How sugar how
## black : 74 alone:195 No.sugar:155 tea bag :170
## Earl Grey:193 lemon: 33 sugar :145 tea bag+unpackaged: 94
## green : 33 milk : 63 unpackaged : 36
## other: 9
##
##
##
## where price age sex
## chain store :192 p_branded : 95 Min. :15.00 F:178
## chain store+tea shop: 78 p_cheap : 7 1st Qu.:23.00 M:122
## tea shop : 30 p_private label: 21 Median :32.00
## p_unknown : 12 Mean :37.05
## p_upscale : 53 3rd Qu.:48.00
## p_variable :112 Max. :90.00
##
## SPC Sport age_Q frequency
## employee :59 Not.sportsman:121 15-24:92 1/day : 95
## middle :40 sportsman :179 25-34:69 1 to 2/week: 44
## non-worker :64 35-44:40 +2/day :127
## other worker:20 45-59:61 3 to 6/week: 34
## senior :35 +60 :38
## student :70
## workman :12
## escape.exoticism spirituality healthy
## escape-exoticism :142 Not.spirituality:206 healthy :210
## Not.escape-exoticism:158 spirituality : 94 Not.healthy: 90
##
##
##
##
##
## diuretic friendliness iron.absorption
## diuretic :174 friendliness :242 iron absorption : 31
## Not.diuretic:126 Not.friendliness: 58 Not.iron absorption:269
##
##
##
##
##
## feminine sophisticated slimming
## feminine :129 Not.sophisticated: 85 No.slimming:255
## Not.feminine:171 sophisticated :215 slimming : 45
##
##
##
##
##
## exciting relaxing effect.on.health
## exciting :116 No.relaxing:113 effect on health : 66
## No.exciting:184 relaxing :187 No.effect on health:234
##
##
##
##
##
gather(my_tea) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar() + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
## Warning: attributes are not identical across measure variables;
## they will be dropped
keep_columns <- c("breakfast", "sex", "price", "healthy", "spirituality", "friends", "Tea", "tearoom")
my_tea1 <- dplyr::select(my_tea, one_of(keep_columns))
gather(my_tea1) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar() + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
## Warning: attributes are not identical across measure variables;
## they will be dropped
mca <- MCA(my_tea1, graph = FALSE)
summary(mca)
##
## Call:
## MCA(X = my_tea1, graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## Variance 0.193 0.165 0.150 0.147 0.139 0.128
## % of var. 11.885 10.164 9.218 9.031 8.544 7.859
## Cumulative % of var. 11.885 22.050 31.268 40.299 48.843 56.702
## Dim.7 Dim.8 Dim.9 Dim.10 Dim.11 Dim.12
## Variance 0.121 0.115 0.110 0.102 0.096 0.087
## % of var. 7.429 7.054 6.784 6.297 5.928 5.371
## Cumulative % of var. 64.131 71.185 77.970 84.267 90.195 95.565
## Dim.13
## Variance 0.072
## % of var. 4.435
## Cumulative % of var. 100.000
##
## Individuals (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 1 | 0.767 1.015 0.144 | 0.347 0.242 0.029 | 0.713
## 2 | 0.261 0.117 0.057 | 0.285 0.164 0.069 | -0.327
## 3 | -0.400 0.276 0.233 | -0.236 0.113 0.081 | 0.005
## 4 | 0.057 0.006 0.003 | -0.199 0.080 0.034 | 0.209
## 5 | -0.005 0.000 0.000 | 0.067 0.009 0.003 | 0.484
## 6 | 0.641 0.709 0.171 | -0.080 0.013 0.003 | 0.234
## 7 | -0.151 0.039 0.028 | 0.019 0.001 0.000 | 0.020
## 8 | 0.262 0.118 0.059 | 0.161 0.052 0.022 | -0.149
## 9 | 0.533 0.491 0.180 | 0.465 0.437 0.137 | 0.375
## 10 | 0.503 0.436 0.118 | 1.130 2.577 0.594 | -0.477
## ctr cos2
## 1 1.132 0.125 |
## 2 0.238 0.090 |
## 3 0.000 0.000 |
## 4 0.097 0.037 |
## 5 0.521 0.164 |
## 6 0.122 0.023 |
## 7 0.001 0.001 |
## 8 0.050 0.019 |
## 9 0.313 0.089 |
## 10 0.505 0.106 |
##
## Categories (the 10 first)
## Dim.1 ctr cos2 v.test Dim.2 ctr
## breakfast | -0.002 0.000 0.000 -0.030 | 0.211 1.613
## Not.breakfast | 0.002 0.000 0.000 0.030 | -0.194 1.489
## F | -0.357 4.897 0.186 -7.458 | -0.173 1.347
## M | 0.521 7.144 0.186 7.458 | 0.253 1.965
## p_branded | 0.289 1.708 0.039 3.398 | -0.960 22.087
## p_cheap | 0.678 0.694 0.011 1.812 | 0.329 0.191
## p_private label | 0.723 2.369 0.039 3.431 | 0.274 0.398
## p_unknown | 0.192 0.095 0.002 0.677 | -0.040 0.005
## p_upscale | 0.563 3.621 0.068 4.507 | 1.183 18.709
## p_variable | -0.710 12.168 0.300 -9.471 | 0.187 0.986
## cos2 v.test Dim.3 ctr cos2 v.test
## breakfast 0.041 3.500 | -0.286 3.276 0.075 -4.751 |
## Not.breakfast 0.041 -3.500 | 0.264 3.024 0.075 4.751 |
## F 0.044 -3.618 | -0.242 2.905 0.086 -5.059 |
## M 0.044 3.618 | 0.353 4.238 0.086 5.059 |
## p_branded 0.427 -11.300 | -0.341 3.069 0.054 -4.012 |
## p_cheap 0.003 0.880 | 1.264 3.110 0.038 3.378 |
## p_private label 0.006 1.301 | 0.150 0.131 0.002 0.709 |
## p_unknown 0.000 -0.142 | 2.649 23.430 0.292 9.351 |
## p_upscale 0.300 9.475 | -0.267 1.049 0.015 -2.137 |
## p_variable 0.021 2.493 | 0.024 0.019 0.000 0.326 |
##
## Categorical variables (eta2)
## Dim.1 Dim.2 Dim.3
## breakfast | 0.000 0.041 0.075 |
## sex | 0.186 0.044 0.086 |
## price | 0.319 0.560 0.369 |
## healthy | 0.010 0.044 0.413 |
## spirituality | 0.082 0.019 0.000 |
## friends | 0.412 0.000 0.000 |
## Tea | 0.272 0.340 0.162 |
## tearoom | 0.264 0.273 0.092 |
plot(mca, invisible=c("ind"), habillage="quali")
res.mca=MCA(my_tea1)
BPRS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/BPRS.txt", sep=" ", header=TRUE)
str(BPRS)
## 'data.frame': 40 obs. of 11 variables:
## $ treatment: int 1 1 1 1 1 1 1 1 1 1 ...
## $ subject : int 1 2 3 4 5 6 7 8 9 10 ...
## $ week0 : int 42 58 54 55 72 48 71 30 41 57 ...
## $ week1 : int 36 68 55 77 75 43 61 36 43 51 ...
## $ week2 : int 36 61 41 49 72 41 47 38 39 51 ...
## $ week3 : int 43 55 38 54 65 38 30 38 35 55 ...
## $ week4 : int 41 43 43 56 50 36 27 31 28 53 ...
## $ week5 : int 40 34 28 50 39 29 40 26 22 43 ...
## $ week6 : int 38 28 29 47 32 33 30 26 20 43 ...
## $ week7 : int 47 28 25 42 38 27 31 25 23 39 ...
## $ week8 : int 51 28 24 46 32 25 31 24 21 32 ...
summary(BPRS)
## treatment subject week0 week1
## Min. :1.0 Min. : 1.00 Min. :24.00 Min. :23.00
## 1st Qu.:1.0 1st Qu.: 5.75 1st Qu.:38.00 1st Qu.:35.00
## Median :1.5 Median :10.50 Median :46.00 Median :41.00
## Mean :1.5 Mean :10.50 Mean :48.00 Mean :46.33
## 3rd Qu.:2.0 3rd Qu.:15.25 3rd Qu.:58.25 3rd Qu.:54.25
## Max. :2.0 Max. :20.00 Max. :78.00 Max. :95.00
## week2 week3 week4 week5
## Min. :26.0 Min. :24.00 Min. :20.00 Min. :20.00
## 1st Qu.:32.0 1st Qu.:29.75 1st Qu.:28.00 1st Qu.:26.00
## Median :38.0 Median :36.50 Median :34.50 Median :30.50
## Mean :41.7 Mean :39.15 Mean :36.35 Mean :32.55
## 3rd Qu.:49.0 3rd Qu.:44.50 3rd Qu.:43.00 3rd Qu.:38.00
## Max. :75.0 Max. :76.00 Max. :66.00 Max. :64.00
## week6 week7 week8
## Min. :19.00 Min. :18.0 Min. :20.00
## 1st Qu.:22.75 1st Qu.:23.0 1st Qu.:22.75
## Median :28.50 Median :30.0 Median :28.00
## Mean :31.23 Mean :32.2 Mean :31.43
## 3rd Qu.:37.00 3rd Qu.:38.0 3rd Qu.:35.25
## Max. :64.00 Max. :62.0 Max. :75.00
These data are treatment and groups
library(dplyr)
library(tidyr)
library(ggplot2)
BPRS$treatment <- factor(BPRS$treatment)
BPRS$subject <- factor(BPRS$subject)
str(BPRS)
## 'data.frame': 40 obs. of 11 variables:
## $ treatment: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ subject : Factor w/ 20 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ week0 : int 42 58 54 55 72 48 71 30 41 57 ...
## $ week1 : int 36 68 55 77 75 43 61 36 43 51 ...
## $ week2 : int 36 61 41 49 72 41 47 38 39 51 ...
## $ week3 : int 43 55 38 54 65 38 30 38 35 55 ...
## $ week4 : int 41 43 43 56 50 36 27 31 28 53 ...
## $ week5 : int 40 34 28 50 39 29 40 26 22 43 ...
## $ week6 : int 38 28 29 47 32 33 30 26 20 43 ...
## $ week7 : int 47 28 25 42 38 27 31 25 23 39 ...
## $ week8 : int 51 28 24 46 32 25 31 24 21 32 ...
BPRSL <- BPRS %>% gather(key = weeks, value = bprs, -treatment, -subject)
BPRSL <- BPRSL %>% mutate(weeks = as.integer(substr(weeks, 5,5)))
glimpse(BPRSL)
## Observations: 360
## Variables: 4
## $ treatment <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ subject <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ weeks <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ bprs <int> 42, 58, 54, 55, 72, 48, 71, 30, 41, 57, 30, 55, 36, ...
RATS <- read.table("https://raw.githubusercontent.com/KimmoVehkalahti/MABS/master/Examples/data/rats.txt", sep="\t", header=TRUE)
str(RATS)
## 'data.frame': 16 obs. of 13 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Group: int 1 1 1 1 1 1 1 1 2 2 ...
## $ WD1 : int 240 225 245 260 255 260 275 245 410 405 ...
## $ WD8 : int 250 230 250 255 260 265 275 255 415 420 ...
## $ WD15 : int 255 230 250 255 255 270 260 260 425 430 ...
## $ WD22 : int 260 232 255 265 270 275 270 268 428 440 ...
## $ WD29 : int 262 240 262 265 270 275 273 270 438 448 ...
## $ WD36 : int 258 240 265 268 273 277 274 265 443 460 ...
## $ WD43 : int 266 243 267 270 274 278 276 265 442 458 ...
## $ WD44 : int 266 244 267 272 273 278 271 267 446 464 ...
## $ WD50 : int 265 238 264 274 276 284 282 273 456 475 ...
## $ WD57 : int 272 247 268 273 278 279 281 274 468 484 ...
## $ WD64 : int 278 245 269 275 280 281 284 278 478 496 ...
RATS$Group <- factor(RATS$Group)
RATS$ID <- factor(RATS$ID)
str(RATS)
## 'data.frame': 16 obs. of 13 variables:
## $ ID : Factor w/ 16 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Group: Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 2 2 ...
## $ WD1 : int 240 225 245 260 255 260 275 245 410 405 ...
## $ WD8 : int 250 230 250 255 260 265 275 255 415 420 ...
## $ WD15 : int 255 230 250 255 255 270 260 260 425 430 ...
## $ WD22 : int 260 232 255 265 270 275 270 268 428 440 ...
## $ WD29 : int 262 240 262 265 270 275 273 270 438 448 ...
## $ WD36 : int 258 240 265 268 273 277 274 265 443 460 ...
## $ WD43 : int 266 243 267 270 274 278 276 265 442 458 ...
## $ WD44 : int 266 244 267 272 273 278 271 267 446 464 ...
## $ WD50 : int 265 238 264 274 276 284 282 273 456 475 ...
## $ WD57 : int 272 247 268 273 278 279 281 274 468 484 ...
## $ WD64 : int 278 245 269 275 280 281 284 278 478 496 ...
RATSL <- RATS %>% gather(key = WDs, value = rats, -ID, -Group) %>%
mutate(Time = as.integer(substr(WDs, 3, 4)))
glimpse(RATSL)
## Observations: 176
## Variables: 5
## $ ID <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...
## $ Group <fct> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1,...
## $ WDs <chr> "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", ...
## $ rats <int> 240, 225, 245, 260, 255, 260, 275, 245, 410, 405, 445, 5...
## $ Time <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8,...
write.csv(RATS, "data/rats_long.csv")
write.csv(BPRS, "data/bprs_long.csv")
ggplot(BPRSL, aes(x = weeks, y = bprs, linetype = subject)) +
geom_line() +
scale_linetype_manual(values = rep(1:10, times=4)) +
facet_grid(. ~ treatment, labeller = label_both) +
theme(legend.position = "none") +
scale_y_continuous(limits = c(min(BPRSL$bprs), max(BPRSL$bprs)))
BPRSL <- BPRSL %>%
group_by(weeks) %>%
mutate(stdbprs = (bprs - mean(bprs))/sd(bprs)) %>%
ungroup()
ggplot(BPRSL, aes(x = weeks, y = stdbprs, linetype = subject)) +
geom_line() +
scale_linetype_manual(values = rep(1:10, times=4)) +
facet_grid(. ~ treatment, labeller = label_both) +
scale_y_continuous(name = "standardized bprs")
n <- BPRSL$weeks %>% unique() %>% length()
BPRSS <- BPRSL %>%
group_by(treatment, weeks) %>%
summarise( mean = mean(bprs), se = sd(bprs)/sqrt(n) ) %>%
ungroup()
glimpse(BPRSS)
## Observations: 18
## Variables: 4
## $ treatment <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2
## $ weeks <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8
## $ mean <dbl> 47.00, 46.80, 43.55, 40.90, 36.60, 32.70, 29.70, 29....
## $ se <dbl> 4.534468, 5.173708, 4.003617, 3.744626, 3.259534, 2....
Mean response profiles for the two treatment groups in the BPRS data.
ggplot(BPRSS, aes(x = weeks, y = mean, linetype = treatment, shape = treatment)) +
geom_line() +
scale_linetype_manual(values = c(1,2)) +
geom_point(size=3) +
scale_shape_manual(values = c(1,2)) +
#geom_errorbar(aes(ymin=mean-se, ymax=mean+se, linetype="1"), width=0.3) +
theme(legend.position = c(0.8,0.8)) +
scale_y_continuous(name = "mean(bprs) +/- se(bprs)")
RATSL<-gather(RATS, key=WDs, value=weight, WD1:WD64)%>%mutate(time= as.integer(substr(WDs,3,4)))
ggplot(RATSL, aes(x = time, y = weight, linetype = ID)) +
geom_line() +
scale_linetype_manual(values = rep(1:10, times=4)) +
facet_grid(. ~ Group, labeller = label_both) +
theme(legend.position = "none") +
scale_y_continuous(limits = c(min(RATSL$weight), max(RATSL$weight)))
Standardise the weight
RATSL<- RATSL%>%group_by(time)%>%mutate(stdweight = (weight - mean(weight))/sd(weight))%>%ungroup()
glimpse(RATSL)
## Observations: 176
## Variables: 6
## $ ID <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ Group <fct> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1...
## $ WDs <chr> "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD1", "WD...
## $ weight <int> 240, 225, 245, 260, 255, 260, 275, 245, 410, 405, 44...
## $ time <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8...
## $ stdweight <dbl> -1.0011429, -1.1203857, -0.9613953, -0.8421525, -0.8...
p1<-ggplot(RATSL, aes(x=time, y=stdweight, linetype=ID))
p2<-p1+geom_line()+scale_linetype_manual(values=rep(1:10, times=4))
p3<-p2+facet_grid(. ~Group, labeller = label_both)
p4<-p3+theme_bw()+theme(legend.position ="none")
p5<-p4+theme(panel.grid.minor.y=element_blank())
p6<-p5+scale_y_continuous(name="standardized weights")
p6
Numberof times, baseline (time 0)
n<-RATSL$time%>%unique()%>%length()
Make a summary data
RATSS<-RATSL%>%group_by(Group, time)%>% summarise(mean=mean(weight), se=sd(weight)/sqrt(n))%>%ungroup()
glimpse(RATSS)
## Observations: 33
## Variables: 4
## $ Group <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,...
## $ time <int> 1, 8, 15, 22, 29, 36, 43, 44, 50, 57, 64, 1, 8, 15, 22, ...
## $ mean <dbl> 250.625, 255.000, 254.375, 261.875, 264.625, 265.000, 26...
## $ se <dbl> 4.589478, 3.947710, 3.460116, 4.100800, 3.333956, 3.5529...
Visualize the mean weight and SE
p1<-ggplot(RATSS, aes(x=time, y=mean, linetype=Group, shape=Group))
p2<-p1+geom_line()+scale_linetype_manual(values=c(1,2,3))
p3<-p2+geom_point(size=3)+scale_shape_manual(values=c(1,2,3))
p4<-p3+geom_errorbar(aes(ymin=mean-se, ymax=mean+se, linetype="1"), width=0.3)
p5<-p4+theme_bw()+theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
p6<-p5+theme(legend.position= "top")
p7<-p6+scale_y_continuous(name="mean(weights) +/- se(weights)")
p7